/**********************************************************************/
/*
   	Author: Karan Makkar, Michelle Han, Adapted from Nikhil Kumar
	Last Updated: June 2024
   	Description: Cleans raw September 2020 SUSENAS data. Outputs deidentified
	cleaned SUSENAS data merged with PMO data.

	This cleaning file should output 2 different datasets:
	1. Cleaned Susenas data:
	sus_sep20_deid_clean
	2. Subset of Susenas person-batch data matched with PMO:
	sus_sep20_deid_clean_merged

*/
/**********************************************************************/

*******************************************
* Setup

* Load the project filepaths unless $master_run is "1"
* (presumably set by a master do-file that has already included them)
  	if "$master_run" != "1" include "./Do/SET_FILEPATHS.do"

* Close any log left open, then start a date-stamped text log
	capture log close
	global prefix: display %tdCYND td(`c(current_date)')
	log using "$KP_logs/${prefix}_clean_SUSENAS_sep2020.txt", replace text

* Load the individual file and attach the household file on RENUM.
* assert(match) aborts the run unless every observation on both sides matches.
	use "$KP_deid_susenas/Raw/Individual/SUSENAS_SEP20_deid.dta", clear
	rename renum RENUM // merge key must be named RENUM to match the using file
	merge m:1 RENUM using "$KP_deid_susenas/Raw/Household/susenas20sep_rt_diseminasi.dta", assert(match) nogenerate

*******************************************
* Recode yes/no questions 

	label define yesno 0 "No" 1 "Yes"

	* Variables (and wildcard stubs) coded 1 = yes / 5 = no in the raw data
	local yesno_vars m506 M703* M705A* M706* M707*

	foreach v of varlist `yesno_vars' {
		* Raw values must be 1, 5 or system missing; anything else stops the run
		assert inlist(`v', 1, 5) | `v' == .
		* 5 -> 0 (No). Missing -> 0 as well: these questions are skipped
		* depending on earlier answers, so a skip is treated as "No"
		recode `v' (5 = 0) (missing = 0)
		label values `v' yesno
	}

*******************************************
* Demographics

* Gender: two complementary indicators built from m405
	gen gender = (m405 == 1)
	label define gender 0 "Female" 1 "Male"
	label values gender gender

	gen female = (m405 == 2)
	label define female 0 "Male" 1 "Female"
	label values female female

* Number of HH members
	gen hh_size_sus = m301

* Marital status
	gen marital_status = m404

* Age, plus 20-year bands (values outside 0-100 are copied to age_cat unchanged)
	gen age_sus = m407
	recode age_sus (0/20 = 1) (21/40 = 2) (41/60 = 3) (61/80 = 4) (81/100 = 5), generate(age_cat)

* Number of household members under 18 (summed within RENUM)
	gen child = (age_sus < 18)
	by RENUM, sort: egen num_child = total(child)

* Date of birth: year, month, day. Code 98 in month/day becomes missing.
    gen year_dob_sus = m406c
    gen month_dob_sus = m406b if m406b != 98
	gen day_dob_sus = m406a if m406a != 98

* Indicator for age 30 or younger (a missing age yields 0)
	gen young = (age_sus <= 30)
	label define young 0 "Over 30 Years Old" 1 "30 and Under"
	label values young young

* Urban/Rural
	gen urban_sus = (m105 == 1)

* Education (translated to years of schooling)
* Note that the question for schooling in Sept '20 is different from Mar '21
* school_years stays missing for any educ code outside 1-5.
	gen educ = m502
	gen school_years = .
	replace school_years = 3 if educ == 1 // Didn't finish elementary
	replace school_years = 6 if educ == 2 // SD
	replace school_years = 9 if educ == 3 // Junior High
	replace school_years = 12 if educ == 4 // SMA
	replace school_years = 16 if educ == 5 // College/University

* Dummy for educated (graduating high school)
* Uses >= 12 so that SMA graduates (exactly 12 years) are counted, as the value
* label states. The "if !mi()" guard matters because Stata treats missing as
* larger than any number: without it, unknown schooling would be coded 1.
* NOTE(review): this changes data values relative to the previous "> 12" rule,
* so the expected datasignature strings checked before each save later in this
* file must be regenerated.
	gen educated = (school_years >= 12) if !mi(school_years)
	la def educated 0 "Less than 12 Years Schooling" 1 "12 or More Years of Schooling"
	la val educated educated

* In-school indicator: 1 when m501 == 2, 0 otherwise, missing when m501 is missing
	gen inschool = (m501 == 2) if !missing(m501)

* Relationship to the head of household
    gen relation_hh_head = m403

*******************************************
* Consumption

* Consumption
* Each component is divided by M301 (presumably household size -- confirm) and
* logged. The "adj" versions are capped at the 1st and 99th percentiles of the
* strictly positive values and rescaled to thousands. Note that values are
* capped (winsorized), not dropped, although the labels say "trimmed".

* 901 - food
	gen cons_food = M901 / M301
	gen lcon_food = ln(cons_food)

	gen cons_adj_food = cons_food
	summarize cons_adj_food if cons_adj_food > 0, detail
	* r(p1) and r(p99) from the summarize above persist across both replace calls
	replace cons_adj_food = min(cons_adj_food, r(p99)) if !missing(cons_adj_food)
	replace cons_adj_food = max(cons_adj_food, r(p1)) if !missing(cons_adj_food)
	replace cons_adj_food = cons_adj_food / 1000
	label variable cons_adj_food "Adjusted Food Consumption (in 1000s of rupiah, trimmed of 1st and 99th pct)"
	summarize cons_adj_food, detail
	gen lcon_adj_food = ln(cons_adj_food)

* 902 - non-food
	gen cons_nfood = M902 / M301
	gen lcon_nfood = ln(cons_nfood)

	gen cons_adj_nfood = cons_nfood
	summarize cons_adj_nfood if cons_adj_nfood > 0, detail
	replace cons_adj_nfood = min(cons_adj_nfood, r(p99)) if !missing(cons_adj_nfood)
	replace cons_adj_nfood = max(cons_adj_nfood, r(p1)) if !missing(cons_adj_nfood)
	replace cons_adj_nfood = cons_adj_nfood / 1000
	label variable cons_adj_nfood "Adjusted Non-Food Consumption (in 1000s of rupiah, trimmed of 1st and 99th pct)"
	summarize cons_adj_nfood, detail
	gen lcon_adj_nfood = ln(cons_adj_nfood)

* Total consumption = food + non-food
	gen tot_cons = cons_food + cons_nfood
	gen ltot_cons = ln(tot_cons)

	gen tot_adj_cons = tot_cons
	summarize tot_adj_cons if tot_adj_cons > 0, detail
	replace tot_adj_cons = min(tot_adj_cons, r(p99)) if !missing(tot_adj_cons)
	replace tot_adj_cons = max(tot_adj_cons, r(p1)) if !missing(tot_adj_cons)
	replace tot_adj_cons = tot_adj_cons / 1000
	label variable tot_adj_cons "Adjusted Total Consumption (in 1000s of rupiah, trimmed of 1st and 99th pct)"
	summarize tot_adj_cons, detail
	gen ltot_adj_cons = ln(tot_adj_cons)


*******************************************
* Employment

* Employment status in the week prior to survey (m505) and derived indicators
	gen employment_status = m505
	gen self_employed = (m505 == 1)
	gen business_owner = inlist(m505, 2, 3)
	gen perm_worker = (m505 == 4)
	gen family_unpaid = (m505 == 6)
	gen self_emp_bus_owner = inlist(m505, 1, 2, 3)

* Employed = status codes 1-5; code 6 (family/unpaid) and missing count as 0
	gen employed = inlist(employment_status, 1, 2, 3, 4, 5)
	label define emp ///
			1 "Self-employed" ///
			2 "Business with temp emp" ///
			3 "Business with perm emp" ///
			4 "Permanent Worker" ///
			5 "Freelancer" ///
			6 "Family/unpaid worker", replace
	label values employment_status emp
	tabulate employed employment_status, missing row

* Temporarily not working (m503: 1 = yes, 5 = no); forced to 0 for the employed
	gen temp_not_work = m503
	recode temp_not_work (5 = 0)
	replace temp_not_work = 0 if employed == 1
	tabulate temp_not_work employed, missing
	tabulate temp_not_work employment_status, missing

* Household-level flags: any member (within RENUM) owns a business / is self-employed
	by RENUM, sort: egen hh_business = max(business_owner)
	by RENUM, sort: egen hh_emp_self = max(self_employed)

	rename m504 ind_code


*******************************************
* Social Assistance

* Kartu Prakerja (PK)
* 506: whether the respondent received Kartu PK
	generate get_pk = m506
	label variable get_pk "Received Kartu Prakerja"
	tabulate get_pk

* 705 - PKH recipient (either march or august 2020)
* receive_pkh_year is 1 when either M705AI or M705AII equals 1.
	gen receive_pkh_year = (M705AI == 1 | M705AII == 1)
	la val receive_pkh_year yesno 
	la var receive_pkh_year "Household received PKH in March or August '20"

* receive_pkh_now uses the second period (M705AII) only. It previously carried
* the same "March or August" label as receive_pkh_year, which misdescribed it,
* and was followed by a replace that repeated the gen condition.
* NOTE(review): assumes M705AII is the August '20 item -- confirm against the
* questionnaire before relying on this label.
	gen receive_pkh_now = (M705AII == 1)
	la val receive_pkh_now yesno 
	la var receive_pkh_now "Household received PKH in August '20"

* 705b: PKH receipt method
	gen pkh_receipt_method = M705B

* Household module
* 703: copy items A-E into named assistance indicators, in this order
	local src_703 M703A M703B M703C M703D M703E
	local new_703 assist_vet assist_pen assist_workinsur assist_lifeinsur assist_sevpay
	forvalues i = 1/5 {
		local from : word `i' of `src_703'
		local to   : word `i' of `new_703'
		gen `to' = `from'
	}

* 706: other social assistance by source -- central govt (A), local govt (B),
* company (C), NGOs serving households (D), other households (E), other
* countries (F). For each source, item 1 is receipt and item 2 is the yes/no
* of whether it was related to COVID. All six sources are kept.
	local letters_706 A B C D E F
	local sources_706 govt loc firm ngo hh foreign
	forvalues i = 1/6 {
		local ltr : word `i' of `letters_706'
		local who : word `i' of `sources_706'
		gen assist_`who' = M706`ltr'1
		gen assist_covid_`who' = M706`ltr'2
	}

* m707:
* Column 2: non-cash food assistance (BPNT)
* Column 3: social cash assistance (BST) due to Covid
* Column 4: direct cash assistance (BLT) Village Fund due to Covid
* For each program, the row total across its M707 items is capped at 1.

* BPNT
	egen ever_BPNT = rowtotal(M707*2)
	replace ever_BPNT = min(ever_BPNT, 1)

* BPNT receipt by month (items A-D)
	local bpnt_letters A B C D
	local bpnt_months aug20 jul20 jun20 may20
	forvalues i = 1/4 {
		local ltr : word `i' of `bpnt_letters'
		local mon : word `i' of `bpnt_months'
		gen `mon'_BPNT = M707`ltr'2
		tabulate `mon'_BPNT
	}

* BST
	egen receive_BST = rowtotal(M707*3)
	replace receive_BST = min(receive_BST, 1)
	label values receive_BST yesno
	tabulate receive_BST

* BLT Village Fund
	egen receive_BLTDD = rowtotal(M707*4)
	replace receive_BLTDD = min(receive_BLTDD, 1)
	label values receive_BLTDD yesno
	tabulate receive_BLTDD

* BLTDD receipt by month (items A-E)
	local bltdd_letters A B C D E
	local bltdd_months aug20 jul20 jun20 may20 apr20
	forvalues i = 1/5 {
		local ltr : word `i' of `bltdd_letters'
		local mon : word `i' of `bltdd_months'
		gen `mon'_BLTDD = M707`ltr'4
		tabulate `mon'_BLTDD
	}

	*******************************************
	* House Characteristics
	generate house_area = M601B

	generate house_pln_elec = M602
	* 1 when M602 is 1 or 3, 0 for other codes, missing when M602 is missing
	generate has_pln_elec = (M602 == 1 | M602 == 3) if !missing(M602)

	*******************************************
	* Count how many extra copies of each anon_id4 exist (0 = unique)
  	duplicates tag anon_id4, generate(anon_id4_dupe)
  	tabulate anon_id4_dupe

	* Drop every observation whose non-missing anon_id4 is shared with another
	* observation (no copy is kept); observations with missing anon_id4 stay
  	drop if anon_id4_dupe > 0 & !missing(anon_id4)
  	drop anon_id4_dupe

	/*----------------------------------------------------*/
	            * Export cleaned data
	/*----------------------------------------------------*/

	rename fwt_pnl weight

	* Save only if the dataset reproduces the expected data signature.
	* On a mismatch, report the signature actually obtained and abort with a
	* real Stata error code. This replaces the previous reliance on the
	* non-existent command "stop", which halted only via r(199).
	datasignature
	local sig_found "`r(datasignature)'"
	if "`sig_found'" == "257968:188(33950):2889118331:3200101656" {
		save "$KP_deid_susenas/Clean/sus_sep20_deid_clean.dta", replace
	}
	else {
		di as err "Careful, your machine produces a different dataset"
		di as err "  datasignature obtained: `sig_found'"
		error 459
	}
	

	/*----------------------------------------------------*/
	          * Merge with admin data and export
	/*----------------------------------------------------*/

	generate sus_round = 5

	* Keep only observations with an anon_id4, then drop variables named m4*/m5*
	* and this dataset's date_incentive before merging (presumably so the PMO
	* version of date_incentive is the one used below -- confirm)
	drop if missing(anon_id4)
	drop m4* m5* date_incentive

	* Merge with PMO data, keeping matched observations only
	merge 1:m anon_id4 using "$KP_deid_admin/Clean/pmo_b1-22_clean_long_deid.dta", keep(match) nogenerate

	rename urban urban_pmo
	label variable urban_pmo "Urban (PMO Data, based on city ID)"

	rename urban_sus urban
	label variable urban "Urban (SUSENAS survey data)"

* Flag incentive dates falling between 1 Jun 2020 and 1 Sep 2020, inclusive.
* NOTE(review): an earlier comment described this as "within 4 months of survey
* date"; the window coded here spans three months -- confirm which is intended.
	generate incentive_near_sus = inrange(date_incentive, td(01jun2020), td(01sep2020))
	tabulate incentive_near_sus

	compress

	* Save only if the merged dataset reproduces the expected data signature.
	* On a mismatch, report the signature actually obtained and abort with a
	* real Stata error code. This replaces the previous reliance on the
	* non-existent command "stop", which halted only via r(199).
	datasignature
	local sig_found "`r(datasignature)'"
	if "`sig_found'" == "38717:242(39259):3058940474:3884257506" {
		save "$KP_deid_susenas/Clean/sus_sep20_deid_clean_merged.dta", replace
	}
	else {
		di as err "Careful, your machine produces a different dataset"
		di as err "  datasignature obtained: `sig_found'"
		error 459
	}
	


// end of dofile